!pip install arch
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import ipywidgets as widgets
from IPython.display import display
# Dropdown widget for picking which stock's CSV to analyse.
w = widgets.Dropdown(
    options=['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
             'FB', 'GME', 'MCD', 'PFE', 'PLUG',
             'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Echo the ticker whenever the dropdown's value actually changes."""
    value_changed = change['type'] == 'change' and change['name'] == 'value'
    if value_changed:
        print("You have selected %s" % change['new'])

w.observe(on_change)
display(w)
# Load the chosen ticker's CSV. One templated read replaces the fourteen
# copy-pasted if-blocks (the file name is always 'Final_<TICKER>.csv').
# NOTE(review): when the dropdown is still on 'SELECT' no file is loaded and
# `df` stays undefined — same outcome as the original chain, but explicit.
if w.value != 'SELECT':
    df = pd.read_csv(f'/content/Final_{w.value}.csv')
pd.set_option('display.max_colwidth', None)
# Parse the Date column into proper timestamps.
df['Date'] = df['Date'].astype("datetime64[ns]")
# Drop the leftover CSV row-number column.
del df['Unnamed: 0']
df.head(5)
df.info()
df.shape
sns.set(font_scale=0.8)
# Bump the context to "talk" so titles/ticks are readable on a big figure.
sns.set_context("talk", font_scale=1.3)
# Line plot of the stock's closing-price series.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18, 8))
    sns.lineplot(x=df.Date, y=df.Close, color='blue')
    ax.set_title('Closing Price')
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change().
# (The original piped through .dropna() before assigning, but column
#  assignment re-aligns on the index, so the leading NaN came back anyway —
#  the .dropna() was a no-op and has been removed.)
df['returns'] = 100 * df.Close.pct_change()
# CALCULATE LOG RETURNS: ln(P_t / P_{t-1})
df['log_returns'] = np.log(df.Close / df.Close.shift(1))
df.head()
# Drop rows containing NaN — notably the first row, which has no previous
# close to compute a return from.
df.dropna(inplace=True)
# PLOT DISTRIBUTION PLOTS OF RETURNS & LOG RETURNS
# AND VISUALLY COMPARE THEM WITH THE STANDARD NORMAL DISTRIBUTION
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))
    # Left column: raw time series; right column: histogram with fitted normal.
    axes[0][0].plot(df.returns, color='blue')
    axes[0][0].set_title('Returns')
    # NOTE(review): sns.distplot is deprecated (removed in seaborn 0.14);
    # sns.histplot/displot is the modern replacement — confirm seaborn version.
    sns.distplot(df.returns, norm_hist=True, fit=stats.norm, color='blue',
                 bins=50, ax=axes[0][1])
    axes[0][1].set_title('Returns')
    axes[1][0].plot(df.log_returns, color='green')
    axes[1][0].set_title('Log Returns')
    sns.distplot(df.log_returns, norm_hist=True, fit=stats.norm, color='green',
                 bins=50, ax=axes[1][1])
    axes[1][1].set_title('Log Returns')
    plt.tight_layout()
    fig.show();
# CREATE A FUNCTION THAT CALCULATE REALIZED VOLATILITY
# FROM SAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
"""
Get the daily realized volatility which is calculated as the square root
of sum of squares of log returns within a specific window interval
"""
n = len(series_log_return)
return np.sqrt(np.sum(series_log_return**2)/(n - 1))
intervals = [7, 30, 60, 180, 365]
# Rolling realized volatility for each window size, collected in one pass.
# A dict comprehension replaces the manual accumulate-into-dict loop and the
# intermediate `vols_df` dictionary that was later overwritten by a DataFrame.
vols_df = pd.DataFrame(
    {i: df.log_returns.rolling(window=i)
          .apply(realized_volatility_daily).values
     for i in intervals},
    columns=intervals, index=df.index)
# Switch matplotlib to the fivethirtyeight theme for the volatility chart.
plt.style.use(['fivethirtyeight'])
fig, ax = plt.subplots(figsize=(18, 7))
for window in intervals:
    # De-emphasise the noisy 7-day series; draw the longer windows bolder.
    emphasised = window != 7
    ax.plot(vols_df[window],
            label=f'{window}-Day Interval Realized Volatility',
            alpha=1.0 if emphasised else 0.5,
            lw=2 if emphasised else 1)
ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)
plt.legend(loc='best', prop={'size': 14})
plt.show();
INTERVAL_WINDOW = 30
n_future = 7
# GET BACKWARD LOOKING REALIZED VOLATILITY over the last INTERVAL_WINDOW days.
df['vol_current'] = df.log_returns.rolling(window=INTERVAL_WINDOW)\
                                  .apply(realized_volatility_daily)
# GET FORWARD LOOKING REALIZED VOLATILITY: shifting the series back n_future
# days makes each row's rolling window cover future observations (the target).
df['vol_future'] = df.log_returns.shift(-n_future)\
                                 .rolling(window=INTERVAL_WINDOW)\
                                 .apply(realized_volatility_daily)
df.describe()
df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'}, inplace=True)
# numeric_only=True: df still carries the datetime 'Date' column, and
# DataFrame.median() raises on non-numeric columns in pandas >= 2.0
# (older pandas silently skipped them, so behaviour is unchanged there).
df = df.fillna(df.median(numeric_only=True))
df.isna().sum()
df.info()
df.shape
df = df.dropna()
df.dtypes
# NOTE: these two imports repeat the top-of-file imports (notebook-cell habit).
import matplotlib.pyplot as plt
import seaborn as sns
# Full correlation heatmap across every numeric column of the daily frame.
plt.figure(figsize=(40,15))
sns.heatmap(df.corr(),annot=True)
# Histogram of every column's distribution.
df.hist(figsize=(20, 32), bins=70, xlabelsize=8, ylabelsize=8);
def _report_strong_corr(frame, column, label):
    """Print the features whose absolute correlation with `column` exceeds 0.5."""
    col_corr = frame.corr()[column]
    strong = col_corr[abs(col_corr) > 0.5].sort_values(ascending=False)
    print("There are {} strongly correlated values with {}:\n{}".format(
        len(strong), label, strong))

# Correlation reports for the technical indicators and Big-Five averages.
# The original repeated the same three lines per column and, by copy-paste,
# printed "conscientiousness" for E/A/N — those labels are corrected here.
for _col, _label in [('AvgTrueRange', 'AvgTrueRange'),
                     ('NATR', 'NATR'),
                     ('TRANGE', 'TRANGE'),
                     ('O', 'Openness'),
                     ('C', 'conscientiousness'),
                     ('E', 'Extraversion'),
                     ('A', 'Agreeableness'),
                     ('N', 'Neuroticism')]:
    _report_strong_corr(df, _col, _label)
df.columns
# Upward-momentum Big-Five score columns.
for _col in ['B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um', 'B5_N_Um']:
    _report_strong_corr(df, _col, _col)
# Downward momentum correlation
def _report_strong_corr(frame, column, label):
    """Print the features whose absolute correlation with `column` exceeds 0.5."""
    col_corr = frame.corr()[column]
    strong = col_corr[abs(col_corr) > 0.5].sort_values(ascending=False)
    print("There are {} strongly correlated values with {}:\n{}".format(
        len(strong), label, strong))

# Downward-momentum Big-Five scores plus the tweet-metadata indicator columns.
# (The original still printed "Real_or_Fake_tweet" although the column was
#  renamed to Fake_news earlier; the label now matches the actual column.)
for _col in ['B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
             'Fake_news', 'Downward_momentum_created',
             'Upward_momentum_created', 'Verified_status_True',
             'Verified_status_False']:
    _report_strong_corr(df, _col, _col)
sns.set(font_scale=0.8)
# Pairplots of NATR against every feature, five feature columns per figure.
step = 5
for start in range(0, len(df.columns), step):
    sns.pairplot(data=df,
                 x_vars=df.columns[start:start + step],
                 y_vars=['NATR'])
df.dtypes
df.isnull().sum()
# Zero-fill remaining gaps; the dropna afterwards is then a no-op guard.
df.fillna(0, inplace=True)
df.dropna(inplace=True)
sns.set(font_scale=0.8)
# Correlation matrix without the Close column (it dominates the price features).
corr = df.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))
# Show only the strong correlations; weaker cells render blank.
# NOTE(review): the mask keeps corr >= 0.5 but corr <= -0.4 — asymmetric
# thresholds; confirm whether -0.5 was intended.
sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
df.describe()
# DROPPING ALL NaN VALUES
df.dropna(inplace=True)
# Number of trailing days shown in the zoomed-in lower panel.
n_zoom = 365
sns.set_context("talk", font_scale=1.3)
# plt.style.use(['seaborn'])
# VISUALIZE REALIZED CURRENT VS. FUTURE VOLATILITY
with sns.axes_style("whitegrid"):
    # Top panel: full history; bottom panel: last n_zoom days only.
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))
    ax1.plot(df.vol_current, alpha=.8, lw=1, color='gray', ls=':',
             label='Current Volatility')
    ax1.plot(df.vol_future, lw=1, color='blue',
             label=f'Next {n_future} Days Volatility (TARGET)')
    ax2.plot(df.vol_current[-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
             label='Current Volatility')
    ax2.plot(df.vol_future[-n_zoom:], lw=2, color='blue',
             label=f'Next {n_future} Days Volatility (TARGET)')
    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')
    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)
    plt.tight_layout()
    plt.show();
# Distribution of the current 30-day realized volatility with a fitted normal.
with sns.axes_style("darkgrid"):
    figure, axis = plt.subplots(figsize=(10, 6))
    sns.distplot(df.vol_current, norm_hist=True, fit=stats.norm, bins=50,
                 ax=axis)
    plt.title('Daily Volatility Distribution')
    plt.show();
# Experiment 2: weekly granularity
# Second dropdown for the weekly-granularity experiment (same ticker list).
w = widgets.Dropdown(
    options=['SELECT', 'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
             'FB', 'GME', 'MCD', 'PFE', 'PLUG',
             'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    """Echo the ticker whenever the dropdown's value actually changes."""
    if change['type'] == 'change' and change['name'] == 'value':
        selected = change['new']
        print("You have selected %s" % selected)

w.observe(on_change)
display(w)
# Load the chosen ticker's CSV with Date parsed and used as the index.
# One templated read replaces the fourteen copy-pasted if-blocks.
# NOTE(review): on 'SELECT' no file is read and `df` keeps its previous value,
# matching the original chain's behaviour.
if w.value != 'SELECT':
    df = pd.read_csv(f'/content/Final_{w.value}.csv',
                     parse_dates=['Date'], index_col=['Date'])
df.columns
df.shape
df.isnull().sum()
# Fill numeric gaps with column medians (numeric_only avoids a TypeError on
# any non-numeric column under pandas >= 2.0; older pandas skipped them).
df = df.fillna(df.median(numeric_only=True))
# Drop the leftover CSV row-number column.
del df['Unnamed: 0']
df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'}, inplace=True)
# Resample to weekly granularity by averaging each calendar week
# (requires the datetime index set when the CSV was loaded).
df_weekly = df.resample('W').agg('mean')
df_weekly.shape
weekly_corr = df_weekly.corr()
plt.figure(figsize=(40, 15))
sns.heatmap(weekly_corr, annot=True)
sns.set(font_scale=0.8)
df_weekly.hist(figsize=(20, 32), bins=50, xlabelsize=8, ylabelsize=8);
def _report_strong_corr(frame, column, label):
    """Print the features whose absolute correlation with `column` exceeds 0.5."""
    col_corr = frame.corr()[column]
    strong = col_corr[abs(col_corr) > 0.5].sort_values(ascending=False)
    print("There are {} strongly correlated values with {}:\n{}".format(
        len(strong), label, strong))

# Weekly correlation reports: technical indicators, Big-Five averages, and
# upward-momentum Big-Five scores. The original repeated the same three lines
# per column and mislabelled E/A/N as "conscientiousness"; labels corrected.
for _col, _label in [('AvgTrueRange', 'AvgTrueRange'),
                     ('NATR', 'NATR'),
                     ('TRANGE', 'TRANGE'),
                     ('O', 'Openness'),
                     ('C', 'conscientiousness'),
                     ('E', 'Extraversion'),
                     ('A', 'Agreeableness'),
                     ('N', 'Neuroticism')]:
    _report_strong_corr(df_weekly, _col, _label)
for _col in ['B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um', 'B5_N_Um']:
    _report_strong_corr(df_weekly, _col, _col)
# Downward momentum correlation
def _report_strong_corr(frame, column, label):
    """Print the features whose absolute correlation with `column` exceeds 0.5."""
    col_corr = frame.corr()[column]
    strong = col_corr[abs(col_corr) > 0.5].sort_values(ascending=False)
    print("There are {} strongly correlated values with {}:\n{}".format(
        len(strong), label, strong))

# Weekly downward-momentum Big-Five scores plus tweet-metadata indicators.
# (The original printed "Real_or_Fake_tweet" for the renamed Fake_news column;
#  the label now matches the actual column name.)
for _col in ['B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
             'Fake_news', 'Downward_momentum_created',
             'Upward_momentum_created', 'Verified_status_True',
             'Verified_status_False']:
    _report_strong_corr(df_weekly, _col, _col)
sns.set(font_scale=0.8)
# Pairplots of NATR against every weekly feature, five columns per figure.
for i in range(0, len(df_weekly.columns), 5):
    sns.pairplot(data=df_weekly,
                 x_vars=df_weekly.columns[i:i+5],
                 y_vars=['NATR'])
# Zero-fill remaining gaps; the dropna that follows is then a no-op guard.
df_weekly.fillna(0, inplace = True)
df_weekly.dropna(inplace=True)
# Correlation matrix without the Close column.
corr = df_weekly.drop('Close', axis=1).corr()
plt.figure(figsize=(12, 10))
# NOTE(review): mask keeps corr >= 0.5 but corr <= -0.4 — asymmetric
# thresholds; confirm whether -0.5 was intended.
sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);
# Weekly volatility distribution
# Distribution of weekly NATR (volatility proxy) with a fitted normal curve.
with sns.axes_style("darkgrid"):
    figure, axis = plt.subplots(figsize=(10, 6))
    sns.distplot(df_weekly.NATR, norm_hist=True, fit=stats.norm, bins=50,
                 ax=axis)
    plt.title('Weekly Volatility Distribution')
    plt.show();